# +~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~ #  
#
#' @title  Validating our measurements against dictionary-based measurements 
#'          using CHES data	
#' @author Hauke Licht
#' 
#' @note    Internet access needed to run this script (to download CHES data)
#
# +~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~ #

# setup ----

# load packages
library(readr)
library(dplyr)
library(cld2)
library(purrr)
library(lubridate)
library(quanteda) # v3.3.1
library(multidictR) # renv::install("jogrue/multidictR@v0.9.0")
library(regexhelpeR) # renv::install("jogrue/regexhelpeR@v1.1.0")
library(popdictR) # renv::install("jogrue/popdictR@v0.9.0")

# directory layout, relative to the working directory
base_path <- "."
data_path <- file.path(base_path, "data")
output_path <- file.path(data_path, "output")


# read data ----

# party codes mapping (used further below to link parties to CHES party IDs)
party_codes <- file.path(data_path, "exdata", "party_codes_mapping.csv") %>%
  read_csv()

# labeled tweets of parliamentary parties
all_tweets_labeled <- file.path(output_path, "parl_party_tweets_labeled.rds") %>%
  read_rds()

# create corpus ----

de_tweets <- all_tweets_labeled %>%
  # keep political tweets from the German-speaking countries
  filter(
    country_iso3c %in% c("DEU", "AUT", "CHE"),
    political == "yes"
  ) %>%
  # keep tweets whose language cld2 detects as German
  # (`%in%` is FALSE for NA, so undetected languages are dropped as well)
  mutate(lang_guess = cld2::detect_language(text)) %>%
  filter(lang_guess %in% "de") %>%
  # keep only the columns needed downstream ...
  select(
    country_iso3c,
    party_id, party_name_short, user_id,
    status_id, created_at,
    text,
    prob_elitecriticism
  ) %>%
  # ... and append a running document ID
  mutate(doc_id_ = row_number())

# quanteda corpus; all remaining columns become document variables
corp <- de_tweets %>%
  corpus(docid_field = "doc_id_", text_field = "text")

# descriptives of the validation sample
desc <- list(
  n_tweets = nrow(de_tweets),
  n_tweets_by_country = table(de_tweets$country_iso3c),
  n_tweets_by_party = count(de_tweets, country_iso3c, party_id, party_name_short)
)

# apply dictionaries ----

# 1) Gründl (2020) ----

# anti-elitism terms: dictionary rows flagged `Include == 1` of type "anti-elitism"
gruendl_antielitism_terms <- popdictR::gruendl_dictionary_complete %>%
  filter(Include == 1, Type3 == "anti-elitism") %>%
  pull(Word)

# keep a record of the terms that were applied
desc$dictionaries$gruendl <- gruendl_antielitism_terms

# container for the dictionary results (one element per dictionary)
res <- list()

# takes ~3 minutes
s <- Sys.time()
dict_matches_gruendl <- multidictR::run_multidict(
  # input
  corpus = corp,
  dict = gruendl_antielitism_terms,
  dict_name = "antielite_gruendl",
  pattern_type = "regex",
  # output
  at_level = "documents",
  return_value = "binary",
  include_totals = TRUE,
  include_docvars = TRUE,
  # text normalization
  case_insensitive = TRUE,
  tolower = TRUE,
  stem = FALSE,
  # tokenization
  what = "word",
  split_hyphens = FALSE,
  remove_punct = TRUE,
  remove_symbols = TRUE,
  remove_numbers = TRUE,
  remove_url = TRUE,
  remove_separators = TRUE
)
res$dict_matches_gruendl <- dict_matches_gruendl
(Sys.time() - s)
# note: the warnings raised suggest that it might make sense to downgrade quanteda


# 2) Rooduijn/Pauwels (2011) ----

# keep a record of the terms that were applied
desc$dictionaries$rooduijn <- popdictR::rooduijn_2011_german_terms

s <- Sys.time()
res$dict_matches_rooduijn <- multidictR::run_multidict(
  # input
  corpus = corp,
  dict = popdictR::rooduijn_2011_german_terms,
  dict_name = "antielite_rooduijn",
  pattern_type = "glob",
  # output
  at_level = "documents",
  return_value = "binary",
  include_totals = TRUE,
  include_docvars = TRUE,
  # text normalization
  case_insensitive = TRUE,
  tolower = TRUE,
  stem = FALSE,
  # tokenization
  what = "word",
  split_hyphens = FALSE,
  remove_punct = TRUE,
  remove_symbols = TRUE,
  remove_numbers = TRUE,
  remove_url = TRUE,
  remove_separators = TRUE
)
(Sys.time() - s)

# combine data ----

# One row per tweet with the binary match indicator of every dictionary.
# Each element of `res` is converted to a tibble, its `n_*` columns are
# dropped, and the tables are then joined on all columns they share
# (no `by` is given, so dplyr reports the join columns in a message).
dict_matches_df <- res %>%
  # `convert()` is exported by quanteda, so `::` (not `:::`) is the right accessor
  map(quanteda::convert, to = "data.frame") %>%
  map(as_tibble) %>%
  map(function(.d) select(.d, -starts_with("n_"))) %>%
  # fold the tables together pairwise; works for any number of dictionaries
  reduce(inner_join) %>%
  mutate(
    # factor versions of the 0/1 indicators, with "pos" as the first level
    dict_gruendl = factor(antielite_gruendl, levels = c(1, 0), labels = c("pos", "neg")),
    dict_rooduijn = factor(antielite_rooduijn, levels = c(1, 0), labels = c("pos", "neg"))
  )

# read the CHES data ----

# note: I omit the 2017 flash wave because it only records data for Germany 

# list of CHES metadata
# One entry per CHES wave:
#   csv_file            location of the wave's CSV file
#   vars                columns to keep; a name (where given) is the new column
#                       name, so every wave ends up with the same column names
#   field_time_end      end date of the tweet aggregation window for this wave
#   country_mapping_col position of the column in `ches_country_abbreviations`
#                       that holds this wave's country codes
ches <- list(
  "2014" = list(
    csv_file = "https://www.chesdata.eu/s/2014_CHES_dataset_means.csv",
    vars = c("country" = "cname", "party_id", "party" = "party_name", "lrgen", "antielite_salience"),
    field_time_end = ymd("2014-11-30"),
    country_mapping_col = 2
  ),
  "2019" = list(
    csv_file = "https://www.chesdata.eu/s/CHES2019V3.csv",
    vars = c("country", "party_id", "party", "lrgen", "antielite_salience"),
    field_time_end = ymd("2020-01-31"),
    country_mapping_col = 3
  )
)

# Lookup from ISO3 country codes to the country codes used in each CHES wave
# (NA = no code listed for that country).
# NOTE: the first column name is spelled `countr_iso3c` (sic); the joins further
# below refer to it by exactly this name, so it must not be changed here alone.
ches_country_abbreviations <- tibble::tribble(
  ~countr_iso3c, ~ches2014, ~ches2019,
  "AUS",    NA,   NA,
  "AUT", "aus",  13L,
  "BEL", "bel",   1L,
  "CAN",    NA,   NA,
  "DNK", "den",   2L,
  "FIN", "fin",  14L,
  "FRA", "fra",   6L,
  "DEU", "ger",   3L,
  "GRC", "gre",   4L,
  "IRL", "ire",   7L,
  "ITA",  "it",   8L,
  "LUX", "lux",  38L,
  "NLD", "net",  10L,
  "NZL",    NA,   NA,
  "NOR", "nor",  35L,
  "PRT", "por",  12L,
  "ESP", "spa",   5L,
  "SWE", "swe",  16L,
  "CHE", "swi",  36L,
  "GBR",  "uk",  11L
)

#' Read one CHES wave and attach ISO3 country codes
#'
#' Relies on the global lookup table `ches_country_abbreviations`.
#'
#' @param args A list describing one CHES wave with the elements
#'   `csv_file` (file path or URL passed to `read_csv()`),
#'   `vars` (character vector of columns to keep; a name, where given, becomes
#'   the new column name, and one column must end up being called `country`),
#'   and `country_mapping_col` (position of the column in
#'   `ches_country_abbreviations` that holds this wave's country codes).
#' @return `args` with the elements `data` (the selected columns plus
#'   `countr_iso3c`) and `downloaded_at` added, and `country_mapping_col`
#'   removed.
read_ches_data <- function(args) {
  # wave-specific lookup: column 1 = ISO3 code, column 2 = this wave's code
  country_lookup <- ches_country_abbreviations %>%
    select(1, args$country_mapping_col) %>%
    rename(country = 2)

  args$data <- read_csv(args$csv_file) %>%
    # `!!` injects the (partly named) character vector, so names rename columns
    select(!!args$vars) %>%
    # The lookup has NA codes for countries without a CHES code. By default
    # dplyr matches NA to NA, which would duplicate any CHES row with a missing
    # country once per such lookup row; never match on missing keys instead.
    left_join(country_lookup, by = "country", na_matches = "never")

  args$country_mapping_col <- NULL
  args$downloaded_at <- now()

  args
}

# read the data of every CHES wave (downloads the CSV files)
ches_data <- map(ches, ~ read_ches_data(.x))

# stack the waves into one table, identified by `year`; the first column of
# each wave (the wave-specific `country` code) is dropped before stacking
ches_estimates <- ches_data %>%
  map(function(wave) select(wave[["data"]], -1)) %>%
  bind_rows(.id = "year")

# map pre-12-month measurements to CHES estimates ----

#' Party-level tweet averages for one CHES wave, merged with the CHES estimates
#'
#' Aggregates the tweets in `dict_matches_df` that were created within the
#' `n_months` months up to (and including) `edate`, then adds the CHES party
#' IDs from `party_codes` and the estimates in `ches_data[[.year]]$data`.
#' Relies on the globals `dict_matches_df`, `party_codes` and `ches_data`.
#'
#' @param edate Date; last day of the aggregation window.
#' @param .year Name of the CHES wave; stored in column `year` and used to
#'   look up the wave in `ches_data`.
#' @param n_months Length of the aggregation window in months.
#' @return A tibble of party-level averages with the CHES columns joined on.
compute_party_averages <- function(edate, .year, n_months = 12) {
  # `%m-%` rolls back to the last existing day of the target month, whereas
  # `edate - months(n)` returns NA when that day does not exist (e.g. Feb 30)
  window_start <- edate %m-% months(n_months)

  party_mean <- dict_matches_df %>%
    filter(
      as_date(created_at) <= edate,
      as_date(created_at) >= window_start
    ) %>%
    group_by(country_iso3c, party_id, party_name_short, year = .year) %>%
    summarise(
      n_tweets = n(),
      # classifier-based estimates
      prop_elitecriticism = mean(prob_elitecriticism > .5, na.rm = TRUE),
      mean_prob_elitecriticism = mean(prob_elitecriticism, na.rm = TRUE),
      # dictionary-based estimates
      prop_anitelite_dict_gruendl = mean(antielite_gruendl, na.rm = TRUE),
      prop_anitelite_dict_rooduijn = mean(antielite_rooduijn, na.rm = TRUE),
      .groups = "drop"
    )

  party_mean %>%
    # add the CHES party ID of each party
    left_join(
      select(party_codes, country_iso3c, party_id, party_name_short, party_id_ches, to_keep),
      by = c("country_iso3c", "party_id", "party_name_short")
    ) %>%
    # add the CHES estimates of this wave
    left_join(
      select(ches_data[[.year]]$data, -country),
      by = c("country_iso3c" = "countr_iso3c", "party_id_ches" = "party_id")
    )
}

# end of the aggregation window of every CHES wave (kept as a list so that
# the elements stay Date objects)
ches_field_time_ends <- map(ches, "field_time_end")

ches_party_averages <- imap_dfr(
  ches_field_time_ends,
  function(edate, .year) compute_party_averages(edate, .year, n_months = 12)
)

# write to disk (an existing file is never overwritten)
fp <- file.path(output_path, "validation", "party_averages_dictionaries_vs_ches_estimates.rds")
if (!file.exists(fp)) {
  dir.create(dirname(fp), recursive = TRUE, showWarnings = FALSE)
  write_rds(ches_party_averages, fp)
}

# inspect results as function of number of quarters aggregated ----

n_quarters <- c(8, 6, 4, 2)

# one table of party averages per window length (in the order of `n_quarters`)
tmp <- map(n_quarters, function(.q) {
  imap_dfr(
    ches_field_time_ends,
    function(edate, .year) compute_party_averages(edate, .year, n_months = .q * 3)
  )
})

#' Correlate the CHES anti-elite salience with our party-level measures
#'
#' @param x Data frame with the columns `year`, `antielite_salience`,
#'   `mean_prob_elitecriticism`, `prop_anitelite_dict_gruendl` and
#'   `prop_anitelite_dict_rooduijn`.
#' @param .q Value stored in the `quarters` column of the result.
#' @return One row per `year` with the correlations `corr_clf`,
#'   `corr_dict_gruendl` and `corr_dict_rooduijn`, plus `quarters`.
compute_correlations <- function(x, .q) {
  # correlation based on the observations that are complete for the pair
  pairwise_cor <- function(a, b) cor(a, b, use = "pairwise.complete.obs")

  by_year <- group_by(x, year)
  correlations <- summarise(
    by_year,
    corr_clf = pairwise_cor(antielite_salience, mean_prob_elitecriticism),
    corr_dict_gruendl = pairwise_cor(antielite_salience, prop_anitelite_dict_gruendl),
    corr_dict_rooduijn = pairwise_cor(antielite_salience, prop_anitelite_dict_rooduijn)
  )

  mutate(correlations, quarters = .q)
}
# Label every element of `tmp` with its window length in quarters. `tmp` was
# built by mapping over `n_quarters`, so reuse that vector instead of repeating
# its values. The names are character strings and are passed to
# `compute_correlations()` as `.q`.
names(tmp) <- n_quarters

correlations_by_n_quarters <- imap_dfr(tmp, compute_correlations)

# write to disk (an existing file is never overwritten)
fp <- file.path(output_path, "validation", "ches_dictionary_correlations_detailed.csv")
if (!file.exists(fp)) {
  dir.create(dirname(fp), recursive = TRUE, showWarnings = FALSE)
  write_csv(correlations_by_n_quarters, fp)
}
